In [1]:
# Core data stack
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
# NOTE(review): `six` is a Python-2/3 compatibility shim; on Python 3 this
# is simply `from io import StringIO` — consider dropping the dependency.
from six import StringIO  
from sklearn.tree import plot_tree, DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 (use ConfusionMatrixDisplay.from_estimator) — this import
# line fails on current scikit-learn; confirm the pinned version.
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, precision_score, mean_squared_error

# Visualisation libraries

## Text — colored terminal output and rich notebook display
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex

## graphviz
import graphviz 

## seaborn — global context/style for all seaborn plots below
import seaborn as sns
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("whitegrid")

## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
# NOTE(review): the 'seaborn-whitegrid' style was renamed
# 'seaborn-v0_8-whitegrid' in matplotlib 3.6; this call raises there.
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline

## plotly
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
# Graphics in retina format 
%config InlineBackend.figure_format = 'retina' 

## pydot — renders DOT graphs produced by export_graphviz
import pydot

# Blanket warning suppression keeps the writeup clean, but it also hides
# deprecation notices — re-enable when upgrading libraries.
import warnings
warnings.filterwarnings("ignore")

The Basics of Decision Trees

Hitters Dataset Example

This dataset can be extracted from ISLR R package.

In [2]:
# Load the Hitters data with the player name as the row index, then
# drop rows containing missing values before any modelling.
Hitters = pd.read_csv('Data/Hitters.csv', index_col=0)
Hitters = Hitters.dropna()
Hitters.head()
Out[2]:
AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI CWalks League Division PutOuts Assists Errors Salary NewLeague
-Alan Ashby 315 81 7 24 38 39 14 3449 835 69 321 414 375 N W 632 43 10 475.0 N
-Alvin Davis 479 130 18 66 72 76 3 1624 457 63 224 266 263 A W 880 82 14 480.0 A
-Andre Dawson 496 141 20 65 78 37 11 5628 1575 225 828 838 354 N E 200 11 3 500.0 N
-Andres Galarraga 321 87 10 39 42 30 2 396 101 12 48 46 33 N E 805 40 4 91.5 N
-Alfredo Griffin 594 169 4 74 51 35 11 4408 1133 19 501 336 194 A W 282 421 25 750.0 A
In [3]:
# Side-by-side histograms: raw Salary is strongly right-skewed, while
# log(Salary) is much closer to symmetric — motivating the log transform
# used for the regression tree below.
# (Fixes from the original: a throwaway go.Figure() that make_subplots
# immediately replaced has been removed, and "Histrogram" is corrected
# to "Histogram" in both subplot titles.)
fig = make_subplots(rows=1, cols=2,
                    subplot_titles=('Salary Histogram', 'log(Salary) Histogram'))

# Left: raw salaries, binned every $100k
fig.add_trace(go.Histogram(x= Hitters['Salary'].values, xbins=dict(size=100), showlegend=False), 1, 1)
# Right: log salaries, binned every 0.2
fig.add_trace(go.Histogram(x= np.log(Hitters['Salary'].values), xbins=dict(size=0.2), showlegend=False), 1, 2)

# Axis titles and ranges for each panel
fig.update_xaxes(title_text='Salary', range=[0, 2500], row=1, col=1)
fig.update_yaxes(title_text='Frequency', range=[0, 50], row=1, col=1)
fig.update_xaxes(title_text='log(Salary)', range=[4, 8], row=1, col=2)
fig.update_yaxes(title_text='Frequency', range=[0, 50], row=1, col=2)

fig.update_traces(marker_color='RoyalBlue', marker_line_color='Navy', marker_line_width=1.5, opacity=1, row=1, col=1)
fig.update_traces(marker_color='lightYellow', marker_line_color='darkRed', marker_line_width=1.5, opacity=1, row=1, col=2)

# White background with light gridlines and a mirrored frame
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)

fig.show()

We would like to define a function that summarizes our regression tree. This can be done using the following packages.

Package Description
sklearn.tree.export_graphviz Exports a decision tree in the DOT format.
StringIO io.StringIO in Python 3
pydot Visualizes the graph
In [4]:
def Tree_Graph(Estimator, Features, Class_Names = None, fill=True):
    """Render a fitted decision tree inline in the notebook as a PNG.

    Parameters
    ----------
    Estimator : fitted DecisionTreeRegressor or DecisionTreeClassifier
        The tree to visualise.
    Features : list of str
        Feature names used to label the split nodes.
    Class_Names : list of str, optional
        Class labels (classification trees only).
    fill : bool, default True
        Colour nodes by majority class / predicted value.
    """
    # export_graphviz with out_file=None returns the DOT source as a string,
    # so the six.StringIO round-trip of the original is unnecessary.
    dot_data = export_graphviz(Estimator, out_file = None, feature_names = Features,
                               class_names = Class_Names, filled = fill)
    # pydot.graph_from_dot_data returns a list of graphs; DOT from
    # export_graphviz always contains exactly one.
    graph, = pydot.graph_from_dot_data(dot_data)
    display(Image(graph.create_png()))

Regression Trees

The Hitters dataset can be used for predicting a baseball player’s Salary based on Years (the number of years that he has played in the major leagues) and Hits (the number of hits that he made in the previous year).

For the Hitters data, a regression tree for predicting the log salary of a baseball player, based on the number of years that he has played in the major leagues and the number of hits that he made in the previous year. At a given internal node, the label (of the form $X_j < t_k$) indicates the left-hand branch emanating from that split, and the right-hand branch corresponds to $X_j \geq t_k$. For instance, the split at the top of the tree results in two large branches. The left-hand branch corresponds to $Years< 4.5$, and the right-hand branch corresponds to $Years\geq 4.5$. The tree has two internal nodes and three terminal nodes, or leaves. The number in each leaf is the mean of the response for the observations that fall there.

In [5]:
# Fit a small regression tree (at most 3 leaves) predicting log(Salary)
# from Years and Hits, then render it.
feature_cols = ['Years', 'Hits']
X = Hitters[feature_cols].values
y = np.log(Hitters['Salary'].values)

reg = DecisionTreeRegressor(max_leaf_nodes = 3)
_ = reg.fit(X, y)

Tree_Graph(reg, Features=feature_cols)

Overall, the tree segments the players into three regions of predictor space:

  • Players who have played for four or fewer years: $R_1 =\{X~|~\mbox{Years}<4.5\}$
  • Players who have played for five or more years and who made fewer than 118 hits last year: $R_2 =\{X~|~\mbox{Years}\geq4.5,~\mbox{Hits}<117.5\}$
  • Players who have played for five or more years and who made at least 118 hits last year: $R_3 =\{X~|~\mbox{Years}\geq4.5,~\mbox{Hits}\geq117.5\}$
In [6]:
# Visualise the partition of the (Years, Hits) predictor space into the
# three regions found by the tree: splits at Years = 4.5 and Hits = 117.5.
fig, ax = plt.subplots(1, 1, figsize=(16, 8))
ax.scatter(Hitters['Years'], Hitters['Hits'],
           facecolors='RoyalBlue', edgecolors='Navy', alpha=0.4)
ax.set(xlabel='Years', ylabel='Hits',
       xticks=[1, 4.5, 24], yticks=[1, 117.5, 238],
       xlim=[0, 25], ylim=[-1, 250])
# Vertical split: Years >= 4.5
ax.vlines(4.5, 0, 350, linestyles='dashed', lw=1.5, colors='Black')
# Horizontal split (only to the right of the vertical one): Hits >= 117.5
ax.hlines(117.5, 4.5, 350, linestyles='dashed', lw=1.5, colors='Black')
# Region labels
for label, xy in [(r'$R_1$', (2, 117.5)), (r'$R_2$', (11, 60)), (r'$R_3$', (11, 170))]:
    ax.annotate(label, xy=xy, fontsize='xx-large')
# Shade each region; trailing ';' suppresses the final repr
ax.fill_between(np.linspace(0, 4.5, 200), 1, 238, color='LimeGreen', alpha=.1)
ax.fill_between(np.linspace(4.5, 24, 200), 117.5, 238, color='Purple', alpha=.1)
ax.fill_between(np.linspace(4.5, 24, 200), 1, 117.5, color='Salmon', alpha=.1);

Prediction via Stratification of the Feature Space

There are two steps:

  1. We divide the predictor space—that is, the set of possible values for $X_1$, $X_2$, $\ldots$ , $X_p$ into $J$ distinct and non-overlapping regions, $R_1$, $R_2$, $\ldots$ , $R_J$. This can be done via finding high-dimensional rectangles (boxes) $R_1$, $R_2$, $\ldots$ , $R_J$ that minimize the RSS, given by $$\sum_{j=1}^{J}\sum_{i \in R_j} \left(y_i − \hat{y}_{R_j}\right)^2,$$
  2. For every observation that falls into the region $R_j$, we make the same prediction, which is simply the mean of the response values for the training observations in $R_j$.

In order to perform recursive binary splitting, we consider all predictors $X_1$, $X_2$, $\ldots$ , $X_p$, and all possible values of the cutpoint $s$ for each of the predictors, and then choose the predictor and cutpoint such that the resulting tree has the lowest RSS.

Tree Pruning

From the textbook, we have an algorithm for building a Regression Tree, Algorithm 8.1.

In terms of python implementations, there is an in-depth article by Scikit-learn regarding Decision Trees.


  1. Use recursive binary splitting to grow a large tree on the training data, stopping only when each terminal node has fewer than some minimum number of observations.
  2. Apply cost complexity pruning to the large tree in order to obtain a sequence of best subtrees, as a function of $\alpha$.
$$\underbrace{\sum_{m=1}^{|T|} \sum_{i:~x_i \in R_m} \left(y_i - \hat{y}_{R_m}\right)^2}_{\mbox{RSS}} + \underbrace{\alpha|T|}_{\mbox{Shrinkage penalty}}$$
  3. Use K-fold cross-validation to choose $\alpha$. That is, divide the training observations into $K$ folds. For each $k$ = $1$, $\ldots$ , $K$:
    • (a) Repeat Steps 1 and 2 on all but the $k$th fold of the training data.
    • (b) Evaluate the mean squared prediction error on the data in the left-out $k$th fold, as a function of $\alpha$. Average the results for each value of $\alpha$, and pick $\alpha$ to minimize the average error.
  4. Return the subtree from Step 2 that corresponds to the chosen value of $\alpha$.

Classification Trees

A classification tree is very similar to a regression tree, except that it is used to predict a qualitative response rather than a quantitative one.

Heart Example Dataset

These data contain a binary outcome AHD for 303 patients who presented with chest pain. An outcome value of Yes indicates the presence of heart disease based on an angiographic test, while No means no heart disease. There are 13 predictors including Age, Sex, Chol (a cholesterol measurement), and other heart and lung function measurements. Cross-validation results in a tree with six terminal nodes.

Dataset available at this link

In [7]:
# Load the Heart dataset (binary AHD outcome for patients with chest pain),
# drop the exported R row-index column, and remove rows with missing values.
# NOTE(review): this USC URL may no longer resolve — the ISLR datasets have
# since moved; confirm the link (or use a local copy) before re-running.
Heart = pd.read_csv('http://faculty.marshall.usc.edu/gareth-james/ISL/Heart.csv').drop('Unnamed: 0', axis=1).dropna()
Heart.head()
Out[7]:
Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD
0 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No
1 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3.0 normal Yes
2 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes
3 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No
4 41 0 nontypical 130 204 0 2 172 0 1.4 1 0.0 normal No

We can use Pandas Factorize to encode categorical variables as follows,

$$\mbox{Chest Pain} = \begin{cases} 0,&\mbox{Typical},\\ 1,&\mbox{Asymptomatic},\\ 2,&\mbox{Non-Anginal},\\ 3,&\mbox{Non-Typical}. \end{cases}, \qquad \mbox{Thal} = \begin{cases} 0,&\mbox{Fixed},\\ 1,&\mbox{Normal},\\ 2,&\mbox{Reversable}. \end{cases}, \qquad \mbox{AHD} = \begin{cases} 0,&\mbox{No},\\ 1,&\mbox{Yes}. \end{cases}. $$
In [8]:
# Integer-encode the categorical columns (codes assigned in order of
# first appearance, matching pd.factorize's default behaviour).
for cat_col in ['ChestPain', 'Thal', 'AHD']:
    Heart[cat_col] = pd.factorize(Heart[cat_col])[0]

$X$ and $y$ sets, and sklearn DecisionTreeClassifier

In [9]:
# Design matrix (all 13 predictors) and binary target AHD.
X = Heart.drop('AHD', axis=1)
y = Heart['AHD']

# max_features=3 makes split selection stochastic (a random subset of 3
# features is considered at each node); without a fixed random_state the
# tree differed on every notebook re-run. Seed it for reproducibility.
clf = DecisionTreeClassifier(max_depth=None, max_leaf_nodes=6, max_features=3,
                             random_state=0)
_ = clf.fit(X,y)

Tree_Graph(clf, Features=X.columns.tolist(), Class_Names=['No', 'Yes'])

Trees Versus Linear Models

  • Linear regression assumes a model of the form $$f(X) = \beta_0 + \sum_{j=1}^{p}X_j \beta_j$$
  • Regression trees assume a model of the form $$f(X) = \sum_{m=1}^{M}c_m \cdot 1_{(X\in R_m)}$$

where $R_1, \ldots, R_M$ represent a partition of the feature space.

Lab

Fitting Classification Trees

Carseats Dataset Example

This dataset can be extracted from ISLR R package.

In [10]:
# Load the Carseats data, dropping the exported R row-index column and
# any rows containing missing values.
Carseats = (
    pd.read_csv('Data/Carseats.csv')
    .drop('Unnamed: 0', axis=1)
    .dropna()
)
Carseats.head()
Out[10]:
Sales CompPrice Income Advertising Population Price ShelveLoc Age Education Urban US
0 9.50 138 73 11 276 120 Bad 42 17 Yes Yes
1 11.22 111 48 16 260 83 Good 65 10 Yes Yes
2 10.06 113 35 10 269 80 Medium 59 12 Yes Yes
3 7.40 117 100 4 466 97 Medium 55 14 Yes Yes
4 4.15 141 64 3 340 128 Bad 38 13 Yes No

We create a variable High which takes on a value of Yes if the Sales variable exceeds 8, and takes on a value of No otherwise.

In [11]:
# Binarise Sales: 'Yes' when unit sales exceed 8 (thousand), else 'No'.
Carseats['High'] = np.where(Carseats['Sales'] > 8, 'Yes', 'No')

We can encode categorical variables as follows,

$$\mbox{ShelveLoc} = \begin{cases} 0,&\mbox{Bad},\\ 1,&\mbox{Medium},\\ 2,&\mbox{Good}. \end{cases}, \qquad \mbox{Urban} = \begin{cases} 0,&\mbox{No},\\ 1,&\mbox{Yes}. \end{cases}, \qquad \mbox{US} = \begin{cases} 0,&\mbox{No},\\ 1,&\mbox{Yes}. \end{cases}, \qquad \mbox{High} = \begin{cases} 0,&\mbox{No},\\ 1,&\mbox{Yes}. \end{cases}. $$
In [12]:
# Class labels in sorted order ('No' < 'Yes') so they line up with the
# sort=True factorization of High below.
Class_Names = sorted(Carseats['High'].unique())
# Ordinal encoding for shelving quality; 0/1 encoding (No -> 0, Yes -> 1)
# for the binary columns.
Carseats['ShelveLoc'] = Carseats['ShelveLoc'].replace({'Bad': 0, 'Medium': 1, 'Good': 2}).astype(int)
for bin_col in ['US', 'Urban', 'High']:
    Carseats[bin_col] = pd.factorize(Carseats[bin_col], sort = True)[0]

Now,

In [13]:
Carseats.head()
Out[13]:
Sales CompPrice Income Advertising Population Price ShelveLoc Age Education Urban US High
0 9.50 138 73 11 276 120 0 42 17 1 1 1
1 11.22 111 48 16 260 83 2 65 10 1 1 1
2 10.06 113 35 10 269 80 1 59 12 1 1 1
3 7.40 117 100 4 466 97 1 55 14 1 1 0
4 4.15 141 64 3 340 128 0 38 13 1 0 0
In [14]:
# Train/test split (50/50), fit a depth-6 classification tree, and report
# precision/recall/F1 on the held-out half. Sales is dropped from X because
# High is derived from it (keeping it would leak the target).
X = Carseats.drop(['Sales', 'High'], axis=1)
y = Carseats.High

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

clf = DecisionTreeClassifier(max_depth=6)
# BUG FIX: the original first called clf.fit(X_train, X_test) — passing the
# test *features* as the training labels — and then silently refit with the
# correct arguments on the next line. Fit once, correctly.
_ = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

Results = pd.DataFrame(classification_report(y_test, y_pred,
                                             target_names = Class_Names, output_dict=True)).T
display(Results.round(2))
precision recall f1-score support
No 0.77 0.84 0.80 118.00
Yes 0.74 0.65 0.69 82.00
accuracy 0.76 0.76 0.76 0.76
macro avg 0.75 0.74 0.75 200.00
weighted avg 0.76 0.76 0.76 200.00
In [15]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2; ConfusionMatrixDisplay.from_estimator is the drop-in replacement.
from sklearn.metrics import ConfusionMatrixDisplay

# Highlighted header, then the raw confusion matrix as a labelled table.
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Confusion Matrix'  + Style.RESET_ALL)
display(pd.DataFrame(data = confusion_matrix(y_test, y_pred), index = Class_Names, columns = Class_Names))

fig, ax = plt.subplots(1, 2, figsize=(14, 4))

# Left: raw counts; right: row-normalized (per true class) rates.
_ = ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test, display_labels= Class_Names,
                                          cmap= "Blues", normalize= None, ax = ax[0])
_ = ax[0].set_title('Confusion Matrix')

_ = ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test, display_labels= Class_Names,
                                          cmap= "Greens", normalize= 'true', ax = ax[1])
_ = ax[1].set_title('Normalized Confusion Matrix')
Confusion Matrix
No Yes
No 99 19
Yes 29 53
In [16]:
# Render the fitted Carseats classification tree with its class labels.
Tree_Graph(clf, Features=X.columns.tolist(), Class_Names =  Class_Names)

Fitting Regression Trees

Boston Dataset Example

This dataset can be extracted from ISLR R package.

In [17]:
# Load the Boston housing data; drop the exported R row-index column
# and any rows with missing values.
Boston = (
    pd.read_csv('Data/Boston.csv')
    .drop('Unnamed: 0', axis=1)
    .dropna()
)
Boston.head()
Out[17]:
crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2
In [18]:
# Predict median home value (medv) from the remaining columns,
# holding out half of the observations for testing.
X = Boston.drop(columns='medv')
y = Boston['medv']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
In [19]:
# Fit a depth-3 regression tree and predict on the held-out half.
reg = DecisionTreeRegressor(max_depth=3)
_ = reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
# Report the held-out MSE — mean_squared_error was imported at the top of
# the notebook but never used; this is the evaluation figure the lab needs.
print(f'Test MSE: {mean_squared_error(y_test, y_pred):.2f}')
In [20]:
# Render the fitted Boston regression tree.
Tree_Graph(reg, Features=X.columns.tolist())
In [21]:
# Predicted vs. actual medv on the test set, with a y = x reference line.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
_ = ax.scatter(y_pred, y_test, label='medv', facecolors='SkyBlue', edgecolors='MidnightBlue', alpha = 0.8)
# Use ax.transAxes directly rather than plt.gca().transAxes: relying on the
# pyplot "current axes" global is fragile if another figure becomes current.
_ = ax.plot([0, 1], [0, 1], '--k', transform=ax.transAxes)
_ = ax.set_xlabel(r'$y_{pred}$')
_ = ax.set_ylabel(r'$y_{test}$')
_ = ax.set_xlim([-1,51])
_ = ax.set_ylim([-1,51])

References